2¶

Question:

Please do classification by two different algorithms with one dataset. Select one dataset public which consists of more than 10 features. Please do some tasks Exploratory Data Analysis (EDA), feature engineering and some pre-processing if you feel need it. Please explain each your task results

Answer

In [ ]:
import pandas as pd
pd.set_option('future.no_silent_downcasting',True)

dt = pd.read_csv('./data.csv')
In [ ]:
# Descripbe Data Shape
print("Data Shape")
print(dt.shape)
print("--------------")

# Describe overall data
print("Data Info")
print(dt.info(memory_usage=False))
print("--------------")

print("Data Description")
print(dt.describe())
print("--------------")
Data Shape
(1125, 13)
--------------
Data Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1125 entries, 0 to 1124
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   1125 non-null   int64  
 1   id      1125 non-null   int64  
 2   fea_1   1125 non-null   int64  
 3   fea_2   976 non-null    float64
 4   fea_3   1125 non-null   int64  
 5   fea_4   1125 non-null   float64
 6   fea_5   1125 non-null   int64  
 7   fea_6   1125 non-null   int64  
 8   fea_7   1125 non-null   int64  
 9   fea_8   1125 non-null   int64  
 10  fea_9   1125 non-null   int64  
 11  fea_10  1125 non-null   int64  
 12  fea_11  1125 non-null   float64
dtypes: float64(3), int64(10)None
--------------
Data Description
             label            id        fea_1        fea_2        fea_3  \
count  1125.000000  1.125000e+03  1125.000000   976.000000  1125.000000   
mean      0.200000  5.783677e+07     5.482667  1283.911373     2.333333   
std       0.400178  1.817150e+06     1.383338    51.764022     0.878773   
min       0.000000  5.498235e+07     1.000000  1116.500000     1.000000   
25%       0.000000  5.499050e+07     4.000000  1244.000000     1.000000   
50%       0.000000  5.898975e+07     5.000000  1281.500000     3.000000   
75%       0.000000  5.899799e+07     7.000000  1314.500000     3.000000   
max       1.000000  5.900624e+07     7.000000  1481.000000     3.000000   

              fea_4        fea_5        fea_6        fea_7        fea_8  \
count  1.125000e+03  1125.000000  1125.000000  1125.000000  1125.000000   
mean   1.208836e+05     1.928889    10.872000     4.832889   100.802667   
std    8.844523e+04     0.257125     2.676437     2.971182    11.988955   
min    1.500000e+04     1.000000     3.000000    -1.000000    64.000000   
25%    7.200000e+04     2.000000     8.000000     5.000000    90.000000   
50%    1.020000e+05     2.000000    11.000000     5.000000   105.000000   
75%    1.390000e+05     2.000000    11.000000     5.000000   111.000000   
max    1.200000e+06     2.000000    16.000000    10.000000   115.000000   

             fea_9         fea_10       fea_11  
count  1125.000000    1125.000000  1125.000000  
mean      4.195556  164618.495111   134.999004  
std       0.855679  152520.488281   112.616798  
min       1.000000   60000.000000     1.000000  
25%       3.000000   60044.000000     1.000000  
50%       4.000000   72000.000000   173.205081  
75%       5.000000  151307.000000   202.484567  
max       5.000000  650070.000000   707.106781  
--------------
In [ ]:
from ydata_profiling import ProfileReport

ProfileReport(dt, title="Profiling Report")
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

Data preprocessing¶

In [ ]:
dt.columns
Out[ ]:
Index(['label', 'id', 'fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5', 'fea_6',
       'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11'],
      dtype='object')
In [ ]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = dt.copy()
df = df[['label', 'fea_1', 'fea_3', 'fea_4', 'fea_5', 'fea_6', 'fea_7', 'fea_8', 'fea_9', 'fea_10', 'fea_11']]

# Normalisation using min_max scaller
mms = MinMaxScaler()
for i in df.drop(columns=['label']).columns.to_list():
    df[i] = mms.fit_transform(df[[i]])


# Drop null value
df = df.dropna()

# split to training and test
x = df.drop(columns=['label'])
y = df['label'].astype('int')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)

Data processing¶

In [ ]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score

clf_t = DecisionTreeClassifier()

clf_t.fit(x_train, y_train)

y_pred_t = clf_t.predict(x_test)

print(confusion_matrix(y_test, y_pred_t))
print(f"Accuracy  : {accuracy_score(y_test, y_pred_t)}")
print(f"Precision : {precision_score(y_test, y_pred_t)}")
print(f"Recall    : {recall_score(y_test, y_pred_t)}")
print(f"F1-Score  : {f1_score(y_test, y_pred_t)}")
[[202  66]
 [ 55  15]]
Accuracy  : 0.6420118343195266
Precision : 0.18518518518518517
Recall    : 0.21428571428571427
F1-Score  : 0.1986754966887417
In [ ]:
from sklearn.svm import SVC

clf_s = SVC()

clf_s.fit(x_train, y_train)

y_pred_s = clf_s.predict(x_test)

print(confusion_matrix(y_test, y_pred_s))
print(f"Accuracy  : {accuracy_score(y_test, y_pred_s)}")
# print(f"Precision : {precision_score(y_test, y_pred_s)}")
# print(f"Recall    : {recall_score(y_test, y_pred_s)}")
# print(f"F1-Score  : {f1_score(y_test, y_pred_s)}")
[[268   0]
 [ 70   0]]
Accuracy  : 0.7928994082840237